Here we have a dataset compiled by Randal Olson, a data scientist at the University of Pennsylvania. Through this dataset we will explore the gender gap in STEM fields (Science, Technology, Engineering, and Mathematics). This gap is often reported on in the news, and not everyone agrees that there is a gap.
Here, we'll explore how we can communicate the nuanced narrative of the gender gap using effective data visualization.
%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
# Load the percentage of bachelor's degrees awarded to women (one row per year).
women_degrees = pd.read_csv(filepath_or_buffer='percent-bachelors-degrees-women-usa.csv')
# Preview the first five rows.
women_degrees.head(5)
Year | Agriculture | Architecture | Art and Performance | Biology | Business | Communications and Journalism | Computer Science | Education | Engineering | English | Foreign Languages | Health Professions | Math and Statistics | Physical Sciences | Psychology | Public Administration | Social Sciences and History | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1970 | 4.229798 | 11.921005 | 59.7 | 29.088363 | 9.064439 | 35.3 | 13.6 | 74.535328 | 0.8 | 65.570923 | 73.8 | 77.1 | 38.0 | 13.8 | 44.4 | 68.4 | 36.8 |
1 | 1971 | 5.452797 | 12.003106 | 59.9 | 29.394403 | 9.503187 | 35.5 | 13.6 | 74.149204 | 1.0 | 64.556485 | 73.9 | 75.5 | 39.0 | 14.9 | 46.2 | 65.5 | 36.2 |
2 | 1972 | 7.420710 | 13.214594 | 60.4 | 29.810221 | 10.558962 | 36.6 | 14.9 | 73.554520 | 1.2 | 63.664263 | 74.6 | 76.9 | 40.2 | 14.8 | 47.6 | 62.6 | 36.1 |
3 | 1973 | 9.653602 | 14.791613 | 60.2 | 31.147915 | 12.804602 | 38.4 | 16.4 | 73.501814 | 1.6 | 62.941502 | 74.9 | 77.4 | 40.9 | 16.5 | 50.4 | 64.3 | 36.4 |
4 | 1974 | 14.074623 | 17.444688 | 61.9 | 32.996183 | 16.204850 | 40.5 | 18.9 | 73.336811 | 2.2 | 62.413412 | 75.3 | 77.9 | 41.8 | 18.2 | 52.6 | 66.1 | 37.3 |
# Preview the last five rows.
women_degrees.tail(5)
Year | Agriculture | Architecture | Art and Performance | Biology | Business | Communications and Journalism | Computer Science | Education | Engineering | English | Foreign Languages | Health Professions | Math and Statistics | Physical Sciences | Psychology | Public Administration | Social Sciences and History | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
37 | 2007 | 47.605026 | 43.100459 | 61.4 | 59.411993 | 49.000459 | 62.5 | 17.6 | 78.721413 | 16.8 | 67.874923 | 70.2 | 85.4 | 44.1 | 40.7 | 77.1 | 82.1 | 49.3 |
38 | 2008 | 47.570834 | 42.711730 | 60.7 | 59.305765 | 48.888027 | 62.4 | 17.8 | 79.196327 | 16.5 | 67.594028 | 70.2 | 85.2 | 43.3 | 40.7 | 77.2 | 81.7 | 49.4 |
39 | 2009 | 48.667224 | 43.348921 | 61.0 | 58.489583 | 48.840474 | 62.8 | 18.1 | 79.532909 | 16.8 | 67.969792 | 69.3 | 85.1 | 43.3 | 40.7 | 77.1 | 82.0 | 49.4 |
40 | 2010 | 48.730042 | 42.066721 | 61.3 | 59.010255 | 48.757988 | 62.5 | 17.6 | 79.618625 | 17.2 | 67.928106 | 69.0 | 85.0 | 43.1 | 40.2 | 77.0 | 81.7 | 49.3 |
41 | 2011 | 50.037182 | 42.773438 | 61.2 | 58.742397 | 48.180418 | 62.2 | 18.2 | 79.432812 | 17.5 | 68.426730 | 69.5 | 84.8 | 43.1 | 40.1 | 76.7 | 81.9 | 49.2 |
The data above shows the percentage of bachelor's degrees awarded to women in each subject, for the years 1970 to 2011.
# Biology: women's share against the complementary men's share (100 - women).
fig = plt.figure()
years = women_degrees["Year"]
biology_women = women_degrees['Biology']
plt.plot(years, biology_women, c='blue', label='Women')
plt.plot(years, 100 - biology_women, c='green', label="Men")
plt.title("Percentage of Biology Degrees Awarded By Gender")
plt.legend(loc="upper right")
plt.show()
# Same Biology chart, this time with the tick marks switched off on all sides.
fig = plt.figure()
biology_women = women_degrees['Biology']
for share, colour, gender in (
    (biology_women, 'blue', 'Women'),
    (100 - biology_women, 'green', 'Men'),
):
    plt.plot(women_degrees['Year'], share, label=gender, c=colour)
no_tick_marks = {'top': False, 'bottom': False, 'right': False, 'left': False}
plt.tick_params(**no_tick_marks)
plt.title("Percentage of Biology Degrees Awarded By Gender")
plt.legend(loc="upper right")
plt.show()
# Biology chart drawn on an explicit Axes, with tick marks and spines removed.
fig, ax = plt.subplots()
years = women_degrees['Year']
biology_women = women_degrees['Biology']
ax.plot(years, biology_women, label='Women', c='blue')
ax.plot(years, 100 - biology_women, label='Men', c='green')
ax.tick_params(top=False, bottom=False, right=False, left=False)
# Hide every spine in one pass rather than addressing
# 'right', 'top', 'bottom' and 'left' one by one.
for spine in ax.spines.values():
    spine.set_visible(False)
ax.legend(loc='upper right')
ax.set_title('Percentage of Biology Degrees Awarded By Gender')
# Save before show() so the file is written from the finished figure.
plt.savefig('new.png')
plt.show()
# 2x2 grid: one chart per major, women's share against men's share.
major_cats = ['Biology', 'Computer Science', 'Engineering', 'Math and Statistics']
fig = plt.figure(figsize=(12, 12))
for position, category in enumerate(major_cats, start=1):
    ax = fig.add_subplot(2, 2, position)
    women_share = women_degrees[category]
    ax.plot(women_degrees['Year'], women_share, c='blue', label='Women')
    ax.plot(women_degrees['Year'], 100 - women_share, c='green', label='Men')
    # Same axis limits on every chart so the panels are directly comparable.
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.tick_params(top=False, bottom=False, right=False, left=False)
    ax.set_title(category)
# pyplot.legend() targets the current Axes, i.e. the last subplot created above.
plt.legend(loc='upper right')
plt.show()
We can conclude that Computer Science and Engineering have big gender gaps, while the gap in Biology and Math and Statistics is quite small. In addition, the first two degree categories are dominated by men while the latter degree categories are much more balanced.
In order to publish the data visualizations that we create, we have to be mindful of color blindness.
Thankfully, there are color palettes we can use that are friendly for people with color blindness. One of them is called Color Blind 10 and was released by Tableau, the company that makes the data visualization platform of the same name. We can navigate to this page and select the Color Blind 10 option from the list of palettes to see the ten colors included in the palette.
# Colours from the Color Blind 10 palette, as RGB fractions in [0, 1].
# They do not depend on the subplot, so they are defined once, before the loop.
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)

fig = plt.figure(figsize=(12, 12))
for sp in range(0, 4):
    ax = fig.add_subplot(2, 2, sp + 1)
    # Women's share and the complementary men's share for this major.
    ax.plot(women_degrees['Year'], women_degrees[major_cats[sp]], c=cb_dark_blue, label='Women')
    ax.plot(women_degrees['Year'], 100 - women_degrees[major_cats[sp]], c=cb_orange, label='Men')
    # Formatting Axes objects: no spines, shared limits, no tick marks.
    for key, spine in ax.spines.items():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(major_cats[sp])
    ax.tick_params(top=False, bottom=False, right=False, left=False)
# The legend is attached to the current Axes, i.e. the last subplot created.
plt.legend(loc='upper right')
plt.show()
# Color Blind 10 palette entries as RGB fractions.
cb_dark_blue = tuple(channel / 255 for channel in (0, 107, 164))
cb_orange = tuple(channel / 255 for channel in (255, 128, 14))
fig = plt.figure(figsize=(12, 12))
for slot in range(1, 5):
    ax = fig.add_subplot(2, 2, slot)
    category = major_cats[slot - 1]
    women_share = women_degrees[category]
    # Thicker lines (linewidth=3) make the two series easier to tell apart.
    ax.plot(women_degrees['Year'], women_share, c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_share, c=cb_orange, label='Men', linewidth=3)
    # Strip the frame and tick marks; keep identical limits on every panel.
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    ax.tick_params(top=False, bottom=False, right=False, left=False)
# Legend goes on the current Axes, which is the last subplot added.
plt.legend(loc='upper right')
plt.show()
# Six STEM majors laid out as a single row of charts.
stem_cats = [
    'Engineering',
    'Computer Science',
    'Physical Sciences',
    'Math and Statistics',
    'Biology',
    'Psychology',
]
fig = plt.figure(figsize=(18, 3))
for position, category in enumerate(stem_cats, start=1):
    ax = fig.add_subplot(1, 6, position)
    women_share = women_degrees[category]
    ax.plot(women_degrees['Year'], women_share, c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_share, c=cb_orange, label='Men', linewidth=3)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    ax.tick_params(top=False, bottom=False, right=False, left=False)
# Legend is drawn on the current Axes, i.e. the right-most chart.
plt.legend(loc='upper right')
plt.show()
cb_dark_blue = (0/255, 107/255, 164/255)
cb_orange = (255/255, 128/255, 14/255)
# Text labels replace the legend: (x, y, text) per subplot index, placed only
# on the first (0) and last (5) charts of the row.
line_labels = {
    0: ((2005, 87, 'Men'), (2002, 8, 'Women')),
    5: ((2004, 13, 'Men'), (1999, 83, 'Women')),
}
fig = plt.figure(figsize=(18, 3))
for sp in range(6):
    ax = fig.add_subplot(1, 6, sp + 1)
    category = stem_cats[sp]
    ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
    ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)
    for x_pos, y_pos, text in line_labels.get(sp, ()):
        ax.text(x_pos, y_pos, text)
    for spine in ax.spines.values():
        spine.set_visible(False)
    ax.set_xlim(1968, 2011)
    ax.set_ylim(0, 100)
    ax.set_title(category)
    ax.tick_params(top=False, bottom=False, right=False, left=False)
plt.show()
# Degree categories grouped into the three columns of the final grid.
stem_cats = [
    'Engineering',
    'Computer Science',
    'Psychology',
    'Biology',
    'Physical Sciences',
    'Math and Statistics',
]
lib_arts_cats = [
    'Foreign Languages',
    'English',
    'Communications and Journalism',
    'Art and Performance',
    'Social Sciences and History',
]
other_cats = [
    'Health Professions',
    'Public Administration',
    'Education',
    'Agriculture',
    'Business',
    'Architecture',
]
# Figure shared by every create_axes_objects() call in this cell.
fig = plt.figure(figsize=(15, 18))
def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of gender-share charts on the module-level ``fig``.

    The figure is treated as a 6x3 grid; chart ``row`` of this column is
    placed in grid cell ``row * 3 + column``.

    axes_range -- number of charts to draw, taken from the front of ``data``.
    column     -- grid column to fill (the callers pass 1, 2 or 3).
    data       -- sequence of ``women_degrees`` column names, top chart first.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    last_row = axes_range - 1
    for row in range(axes_range):
        # Plotting data
        ax = fig.add_subplot(6, 3, row * 3 + column)
        category = data[row]
        ax.plot(women_degrees['Year'], women_degrees[category], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[category], c=cb_orange, label='Men', linewidth=3)
        # Formatting the Axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(category)
        ax.tick_params(bottom=False, top=False, left=False, right=False)
        # Only the first and last charts of the column carry text labels.
        if row == 0:
            label_heights = (Annot_1_M, Annot_1_F)
        elif row == last_row:
            label_heights = (Annot_2_M, Annot_2_F)
        else:
            continue
        ax.text(2006, label_heights[0], 'Men')
        ax.text(2003, label_heights[1], 'Women')
# One entry per grid column: (rows, column, categories, then the four y positions
# of the 'Men'/'Women' labels on the first and last chart).
for column_args in (
    (6, 1, stem_cats, 87, 8, 62, 35),
    (5, 2, lib_arts_cats, 20, 75, 56, 38),
    (6, 3, other_cats, 7, 90, 62, 36),
):
    create_axes_objects(*column_args)
# Column 1 of the grid: STEM degrees.
stem_cats = [
    'Engineering', 'Computer Science', 'Psychology',
    'Biology', 'Physical Sciences', 'Math and Statistics',
]
# Column 2 of the grid: liberal arts degrees (one fewer than the others).
lib_arts_cats = [
    'Foreign Languages', 'English', 'Communications and Journalism',
    'Art and Performance', 'Social Sciences and History',
]
# Column 3 of the grid: everything else.
other_cats = [
    'Health Professions', 'Public Administration', 'Education',
    'Agriculture', 'Business', 'Architecture',
]
fig = plt.figure(figsize=(15, 18))
def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of gender-share charts on the module-level ``fig``.

    The figure is treated as a 6x3 grid; chart ``num`` of this column is
    placed in grid cell ``num * 3 + column``. X tick labels are hidden on
    every chart except the last one of the column.

    axes_range -- number of charts to draw, taken from the front of ``data``.
    column     -- grid column to fill (the callers pass 1, 2 or 3).
    data       -- sequence of ``women_degrees`` column names, top chart first.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    for num in range(axes_range):
        is_first = num == 0
        is_last = num == axes_range - 1

        # Plotting data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        ax.plot(women_degrees['Year'], women_degrees[data[num]], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[data[num]], c=cb_orange, label='Men', linewidth=3)

        # Formatting the Axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(data[num])
        # Hide the x-axis labels on every chart (labelbottom=False) ...
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        if is_first:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif is_last:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')

        # ... then bring them back on the bottom chart. This is checked
        # independently of the annotation branch so that a one-chart column
        # (first and last at once) still gets its x-axis labels.
        if is_last:
            ax.tick_params(labelbottom=True)
# Fill the three grid columns; the Annot_* values are the y positions of the
# 'Men'/'Women' labels on the first (1) and last (2) chart of each column.
create_axes_objects(axes_range=6, column=1, data=stem_cats,
                    Annot_1_M=87, Annot_1_F=8, Annot_2_M=62, Annot_2_F=35)
create_axes_objects(axes_range=5, column=2, data=lib_arts_cats,
                    Annot_1_M=20, Annot_1_F=75, Annot_2_M=56, Annot_2_F=38)
create_axes_objects(axes_range=6, column=3, data=other_cats,
                    Annot_1_M=7, Annot_1_F=90, Annot_2_M=62, Annot_2_F=36)
# The three category groups, one per grid column, listed top chart first.
stem_cats = [
    'Engineering',
    'Computer Science',
    'Psychology',
    'Biology',
    'Physical Sciences',
    'Math and Statistics',
]
lib_arts_cats = [
    'Foreign Languages',
    'English',
    'Communications and Journalism',
    'Art and Performance',
    'Social Sciences and History',
]
other_cats = [
    'Health Professions',
    'Public Administration',
    'Education',
    'Agriculture',
    'Business',
    'Architecture',
]
fig = plt.figure(figsize=(15, 18))
def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of gender-share charts on the module-level ``fig``.

    The figure is treated as a 6x3 grid; chart ``num`` of this column is
    placed in grid cell ``num * 3 + column``. Each chart shows y ticks at 0
    and 100 only, and x tick labels appear only on the last chart of the
    column.

    axes_range -- number of charts to draw, taken from the front of ``data``.
    column     -- grid column to fill (the callers pass 1, 2 or 3).
    data       -- sequence of ``women_degrees`` column names, top chart first.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    bottom_row = axes_range - 1
    for num in range(axes_range):
        # Plotting data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        ax.plot(women_degrees['Year'], women_degrees[data[num]], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[data[num]], c=cb_orange, label='Men', linewidth=3)

        # Formatting the Axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(data[num])
        # Keep only the 0 and 100 ticks on the y-axis of every chart.
        ax.set_yticks([0, 100])
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        if num == 0:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif num == bottom_row:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')

        # Re-enable x-axis labels on the bottom chart. Kept outside the
        # if/elif above so a one-chart column (num == 0 == bottom_row) is
        # not left without x-axis labels.
        if num == bottom_row:
            ax.tick_params(labelbottom=True)
# (rows, column, categories, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F) per grid column.
grid_columns = [
    (6, 1, stem_cats, 87, 8, 62, 35),
    (5, 2, lib_arts_cats, 20, 75, 56, 38),
    (6, 3, other_cats, 7, 90, 62, 36),
]
for column_args in grid_columns:
    create_axes_objects(*column_args)
# Left column: STEM degrees.
stem_cats = [
    'Engineering', 'Computer Science', 'Psychology',
    'Biology', 'Physical Sciences', 'Math and Statistics',
]
# Middle column: liberal arts degrees.
lib_arts_cats = [
    'Foreign Languages', 'English', 'Communications and Journalism',
    'Art and Performance', 'Social Sciences and History',
]
# Right column: remaining degrees.
other_cats = [
    'Health Professions', 'Public Administration', 'Education',
    'Agriculture', 'Business', 'Architecture',
]
fig = plt.figure(figsize=(15, 18))
def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of gender-share charts on the module-level ``fig``.

    The figure is treated as a 6x3 grid; chart ``num`` of this column is
    placed in grid cell ``num * 3 + column``. Every chart gets y ticks at 0
    and 100 and a faint horizontal line at 50; x tick labels appear only on
    the last chart of the column.

    axes_range -- number of charts to draw, taken from the front of ``data``.
    column     -- grid column to fill (the callers pass 1, 2 or 3).
    data       -- sequence of ``women_degrees`` column names, top chart first.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    for num in range(axes_range):
        on_top_chart = num == 0
        on_bottom_chart = num == (axes_range - 1)

        # Plotting data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        ax.plot(women_degrees['Year'], women_degrees[data[num]], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[data[num]], c=cb_orange, label='Men', linewidth=3)

        # Formatting the Axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(data[num])
        ax.set_yticks([0, 100])
        # Horizontal reference line at y=50, light grey, with alpha 0.3.
        ax.axhline(50, c=(171/255, 171/255, 171/255), alpha=0.3)
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        if on_top_chart:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif on_bottom_chart:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')

        # The bottom chart shows x-axis labels. Tested separately from the
        # annotation branch so that a column with a single chart (top and
        # bottom at once) still gets them.
        if on_bottom_chart:
            ax.tick_params(labelbottom=True)
# Draw the STEM, liberal-arts and other columns; Annot_1_* place the labels on
# the first chart of a column, Annot_2_* on the last.
create_axes_objects(axes_range=6, column=1, data=stem_cats,
                    Annot_1_M=87, Annot_1_F=8, Annot_2_M=62, Annot_2_F=35)
create_axes_objects(axes_range=5, column=2, data=lib_arts_cats,
                    Annot_1_M=20, Annot_1_F=75, Annot_2_M=56, Annot_2_F=38)
create_axes_objects(axes_range=6, column=3, data=other_cats,
                    Annot_1_M=7, Annot_1_F=90, Annot_2_M=62, Annot_2_F=36)
# Category groups for the three grid columns, listed top chart first.
stem_cats = [
    'Engineering',
    'Computer Science',
    'Psychology',
    'Biology',
    'Physical Sciences',
    'Math and Statistics',
]
lib_arts_cats = [
    'Foreign Languages',
    'English',
    'Communications and Journalism',
    'Art and Performance',
    'Social Sciences and History',
]
other_cats = [
    'Health Professions',
    'Public Administration',
    'Education',
    'Agriculture',
    'Business',
    'Architecture',
]
import matplotlib as mpb
# Figure that the final grid is drawn on and later saved from.
fig = plt.figure(figsize=(15, 18))
def create_axes_objects(axes_range, column, data, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F):
    """Draw one column of gender-share charts on the module-level ``fig``.

    The figure is treated as a 6x3 grid; chart ``num`` of this column is
    placed in grid cell ``num * 3 + column``. Every chart gets y ticks at 0
    and 100 and a faint horizontal line at 50; x tick labels appear only on
    the last chart of the column.

    axes_range -- number of charts to draw, taken from the front of ``data``.
    column     -- grid column to fill (the callers pass 1, 2 or 3).
    data       -- sequence of ``women_degrees`` column names, top chart first.
    Annot_1_M, Annot_1_F -- y positions of the 'Men' / 'Women' text on the
                            first chart.
    Annot_2_M, Annot_2_F -- y positions of the same text on the last chart.
    """
    last = axes_range - 1
    for num in range(axes_range):
        # Plotting data
        ax = fig.add_subplot(6, 3, (num * 3) + column)
        ax.plot(women_degrees['Year'], women_degrees[data[num]], c=cb_dark_blue, label='Women', linewidth=3)
        ax.plot(women_degrees['Year'], 100 - women_degrees[data[num]], c=cb_orange, label='Men', linewidth=3)

        # Formatting the Axes
        for spine in ax.spines.values():
            spine.set_visible(False)
        ax.set_xlim(1968, 2011)
        ax.set_ylim(0, 100)
        ax.set_title(data[num])
        ax.set_yticks([0, 100])
        # Horizontal line at y=50 with transparency 0.3 in light grey.
        ax.axhline(50, c=(171/255, 171/255, 171/255), alpha=0.3)
        # x-axis labels are off by default and restored below for the last chart.
        ax.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False)

        if num == 0:
            ax.text(2006, Annot_1_M, 'Men')
            ax.text(2003, Annot_1_F, 'Women')
        elif num == last:
            ax.text(2006, Annot_2_M, 'Men')
            ax.text(2003, Annot_2_F, 'Women')

        # Deliberately not part of the elif: with axes_range == 1 the only
        # chart is both first and last and must still show x-axis labels.
        if num == last:
            ax.tick_params(labelbottom=True)
# (rows, column, categories, Annot_1_M, Annot_1_F, Annot_2_M, Annot_2_F) per grid column.
for column_args in (
    (6, 1, stem_cats, 87, 8, 62, 35),
    (5, 2, lib_arts_cats, 20, 75, 56, 38),
    (6, 3, other_cats, 7, 90, 62, 36),
):
    create_axes_objects(*column_args)
# Debug aid, left disabled: print(mpb.get_backend())
# Write the figure to disk before displaying it.
plt.savefig("gender_degrees.png")
plt.show()